{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "first-sequence.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/lmoroney/tfbook/blob/master/chapter5/first-sequence.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zX4Kg8DUTKWO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "# https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "D1J15Vh_1Jih",
        "colab_type": "code",
        "cellView": "both",
        "colab": {}
      },
      "source": [
        "try:\n",
        "  # %tensorflow_version only exists in Colab.\n",
        "  %tensorflow_version 2.x\n",
        "except Exception:\n",
        "  pass\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BOjujz601HcS",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import tensorflow as tf\n",
        "from tensorflow import keras\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gS8kpOnPSmC7",
        "colab_type": "text"
      },
      "source": [
        "# Initial Tokenization"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "OO4yo8VPodVH",
        "colab": {}
      },
      "source": [
        "sentences = [\n",
        "    'Today is a sunny day',\n",
        "    'Today is a rainy day',\n",
        "    'Is it sunny today?'\n",
        "]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "5pNQboo9ofT_",
        "colab": {}
      },
      "source": [
        "# Initial tokenization of the corpus, no OOV used\n",
        "tokenizer = Tokenizer(num_words = 100)\n",
        "tokenizer.fit_on_texts(sentences)\n",
        "word_index = tokenizer.word_index\n",
        "\n",
        "sequences = tokenizer.texts_to_sequences(sentences)\n",
        "print(word_index)\n",
        "print(sequences)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bGLOKPhxSqM8",
        "colab_type": "text"
      },
      "source": [
        "# Exploring Test Data with unseen words"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sN1UtFMAOvPC",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "test_data = [\n",
        "  'Today is a snowy day',\n",
        "  'Will it be rainy tomorrow?'\n",
        "]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GxWeO0MCPRzh",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "test_sequences = tokenizer.texts_to_sequences(test_data)\n",
        "print(word_index)\n",
        "print(test_sequences)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "smKv0AsKSwJD",
        "colab_type": "text"
      },
      "source": [
        "# Adding OOV to improve test data sequences"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vH7dM5eAQAgp",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Here you can re-tokenize with an OOV token\n",
        "tokenizer = Tokenizer(num_words = 100, oov_token=\"<OOV>\")\n",
        "tokenizer.fit_on_texts(sentences)\n",
        "word_index = tokenizer.word_index\n",
        "\n",
        "sequences = tokenizer.texts_to_sequences(sentences)\n",
        "\n",
        "test_sequences = tokenizer.texts_to_sequences(test_data)\n",
        "print(word_index)\n",
        "print(test_sequences)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zqV2LI_5S2FK",
        "colab_type": "text"
      },
      "source": [
        "# Exploring Padding"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G3eRc_1PSjjq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from tensorflow.keras.preprocessing.sequence import pad_sequences"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SkN6lho-SLtE",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "sentences = [\n",
        "    'Today is a sunny day',\n",
        "    'Today is a rainy day',\n",
        "    'Is it sunny today?',\n",
        "    'I really enjoyed walking in the snow today'\n",
        "]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Xr2nWUqgSWJz",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Re-tokenize with the new sentences from above\n",
        "tokenizer = Tokenizer(num_words = 100, oov_token=\"<OOV>\")\n",
        "tokenizer.fit_on_texts(sentences)\n",
        "word_index = tokenizer.word_index\n",
        "\n",
        "sequences = tokenizer.texts_to_sequences(sentences)\n",
        "print(sequences)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KthO_WMcTtDk",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "padded = pad_sequences(sequences)\n",
        "\n",
        "print(padded)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WIENbRT6U2Ut",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "padded = pad_sequences(sequences, padding='post')\n",
        "\n",
        "print(padded)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xwnHlf1-WFGF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "padded = pad_sequences(sequences, padding='post', maxlen=6)\n",
        "\n",
        "print(padded)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MwaIUiDKW8R2",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "padded = pad_sequences(sequences, padding='post', maxlen=6, truncating='post')\n",
        "\n",
        "print(padded)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}